library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(boot)
library(MASS)
library(corrplot)
library(tidyverse)
library(plotly)
Now we shall load the dataset Boston from MASS package and explore it.
data("Boston")
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
dim(Boston)
## [1] 506 14
The data has 14 variables and 506 measurements. Each measurement corresponds to a suburb of Boston i.e. 506 suburbs in total for this dataset. The variables are described in detail at https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/Boston.html
Data contains 12 numeric variables and 2 integer variables.
The variables arevarlabels = c("per capita crime rate","proportion of residential land zoned","proportion of industrial zones per town","towns bordering charles river N/Y","NO concentration, PP10M","average number of rooms per dwelling","proportion of units built pre-1940", "weighted means of distance to 5 Boston employment centres","index of accessibility to radial highways","property tax rate per $10,000","pupil to teacher ratio", "Proortion of blacks by town","lower status of population","median value of homes in $1000s")
selcols <- colnames(Boston)
selvars <- dplyr::select(Boston, one_of(selcols))
a=0
for(var in selvars) {
if(is.integer(var)){
a = a +1
plot <-ggplot(Boston, aes(var, ..count..)) + geom_bar() + xlab(varlabels[a]) + ylab("Number of suburbs")
print(plot)
} else {
a = a +1
plot <- qplot(var,
geom="histogram",
binwidth = ((1/7.9) * sd(var)) * 3.49,
main = paste("Histogram for", varlabels[a]) ,
xlab = (varlabels[a]),
fill=I("blue"),
col=I("red"),
alpha=I(.2))
print(plot)
}
}
cor_matrix<-cor(Boston)
corrplot(cor_matrix, method="circle", type="upper", cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
The correlation matrix shows that the strongest positive correlations are between 1. Accessibility to highways and property tax rate, 2. Industrialization and NO concentration and 3. Number of rooms per dwelling and median value of homes. The strongest negative corellations are between 1. Proportion of buildings built prior to 1940 and distances to employment centres, 2. NO concentration and proportion of buildings built prior to 1940, 3. Proportion of industrial zones and distances to employment centres and 4 Percent lower status of population and median value of owner occupied homes.
Already many patterns can be drawn from this data. Accessibility to highways means higher development and could mean higher property tax rates. More industrial zones can explain higher NO concentrations. Number of rooms per dwelling could explain the median value of homes as larger homes are more expensive. Negative relationship between percent of lower status population and median value of homes shows that richer 0suburbs have wealthier people.
boston_scaled <- scale(Boston)
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
label <- c("low", "med_low", "med_high", "high")
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, label = label)
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
After standardizing the data, the mean of all variables has been centered towards the zero and the range has decreased, but still maintaining similar proportions between them.
Now we shall split the data into train and test set.
n <- nrow(boston_scaled)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
dim(train)
## [1] 404 14
dim(test)
## [1] 102 14
Drawing the LDA bi-plot. Categorical crime rate variable was used as the target and all other variables were used as predictors.
lda.fit <- lda(crime ~ ., data = train)
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
classes <- as.numeric(train$crime)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 3)
correct_classes <- test$crime
test <- dplyr::select(test, -crime)
lda.pred <- predict(lda.fit, newdata = test)
table(correct = correct_classes, predicted = lda.pred$class) %>% addmargins()
## predicted
## correct low med_low med_high high Sum
## low 15 5 0 0 20
## med_low 4 15 11 0 30
## med_high 0 13 10 1 24
## high 0 0 1 27 28
## Sum 19 33 22 28 102
When testing the trained model on test dataset, the model predicted correctly 9 out of 23 low crime, 15 out of 26 med_low crime, 15 out of 23 med_high crime and 30 out of 30 high crime suburbs.
data('Boston')
boston_scaled <- scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
dist_eu <- dist(boston_scaled, method = "euclidean")
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
km <-kmeans(boston_scaled, centers = 3)
pairs(boston_scaled[6:10], col = km$cluster)
Checking the optimum number of clusters and re-analyzing
set.seed(123)
k_max <- 10
twcss <- sapply(1:k_max, function(k){kmeans(boston_scaled, k)$tot.withinss})
qplot(x = 1:k_max, y = twcss, geom = 'line')
km <-kmeans(boston_scaled, centers = 4)
pairs(boston_scaled[6:10], col = km$cluster)
I could not find a clear drop in WCSS after two so I set the number of clusters at 3.
If we draw a pairs plot, we can see that we cannot separate the neighborhoods based on two variables. Some variables are good at separating 2 clusters, but almost no combination of two variables can separate three clusters in a good way.
So why don’t we try separating the clusters based on LDA instead of pairs? I chose 4 clusters for this purpose because it gives a good speeration of data.
data('Boston')
boston_scaled <- scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
set.seed(123)
km <- kmeans(boston_scaled, centers = 4)
lda.fit <- lda(km$cluster~ ., data = boston_scaled)
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
classes <- as.numeric(km$cluster)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 7)
According to the results from the LDA plot, proportion of black people in town, nitric oxide concentrations, industrialization, tax rate and crime rate were the biggest factors separating suburbs into four categories.
Higher crime rate clustered the suburbs in cluster one. Lesser (or Higher? unclear from the variable description and formula) proportion of black people clustered the towns in the opposite direction. NO concentration, industrialization and tax rate clustered suburbs towards left, i.e. clusters 1 and 2.It can also be seen that these zones were more accessible to radial highways.
We predicted crime rate category using other variables and plotted it in 3D!
The individual points were colored in the first graph according to the crime rate category and in the second graph using k means clustering (data was clustered into 4 groups in order to match with four categories of crime rate).
Even though in the second 3D graph the data was clustered using all available data and not specifically to predict crime, note that it still does a pretty good job in separating suburbs into four groups according to crime rate.
#PLOTTING 3D GRAPH FOR TRAIN SET WITH CRIME
data('Boston')
boston_scaled <- scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
n <- nrow(boston_scaled)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind,]
km <- kmeans(train, centers = 4)
bins <- quantile(train$crim)
label <- c("low", "med_low", "med_high", "high")
crime <- cut(train$crim, breaks = bins, include.lowest = TRUE, label = label)
train <- data.frame(train, crime)
lda.fit <- lda(crime ~ ., data = train)
classes <- as.numeric(train$crime)
model_predictors <- dplyr::select(train, -crime)
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime)
library(plotly)
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km$cluster)